

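## This file is designed to be run after standalone.R; it uses objects that are
## not defined here (packageDF, preds, figDir) and expects them to already exist.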

#table(packageDF$maintainership_type)
#packageDF$maintainership_type <- factor(packageDF$maintainership_type, levels= c("Loose", "Team", "Solo", "Team Affiliated Solo"))

#sample
#fit <- euler(c(A = 450, B = 1800, C = 500, "A&B" = 230, "A&C"=10, "A&B&C"=200))
#plot(fit)


## h1: underprod is about old packages and old languages


## Language columns examined for H1 (names as they appear in packageDF).
smallLangList <- c('c', 'c..', 'java', 'shell', 'python', 'perl', 'lisp')

## Narrow copy of packageDF: package name, outcome variable, and one
## column per language of interest.
trimPKDF <- data.frame(
  package = packageDF$package,
  up.fac.mean = packageDF$up.fac.mean,
  c = packageDF$c,
  c.. = packageDF$c..,
  java = packageDF$java,
  shell = packageDF$shell,
  python = packageDF$python,
  perl = packageDF$perl,
  lisp = packageDF$lisp
)

## Keep rows where at least one language column is non-zero. The per-column
## tests are OR-ed together left to right; which() drops rows whose combined
## test is FALSE or NA.
trimPKDF <- trimPKDF[
  which(Reduce(`|`, lapply(trimPKDF[smallLangList], function(v) v != 0))),
  ,
  drop = FALSE
]

## Wide -> long: one row per (package, language), with the language name in
## `lang` and the value of that language column in `proportion`.
trimPKDF.long <- reshape(
  trimPKDF,
  varying = smallLangList,
  v.names = 'proportion',
  timevar = "lang",
  times = smallLangList,
  direction = 'long'
)
trimPKDF.long$id <- NULL # id column added by reshape() is not needed

## Drop (package, language) pairs with zero proportion
## (original author's note: 240 pkgs appear twice).
trimPKDF.long <- trimPKDF.long[
  which(trimPKDF.long$proportion != 0), , drop = FALSE
]
## Strict subset: only pairs whose proportion is exactly 1
## (original author's note: 135 pkgs appear twice).
trimPKDF.long.strict <- trimPKDF.long[
  which(trimPKDF.long$proportion == 1), , drop = FALSE
]

## Fix the display order of the languages, then swap the column-style names
## for human-readable labels.
trimPKDF.long.strict$lang <- recode(
  factor(
    trimPKDF.long.strict$lang,
    levels = c('lisp', 'c', 'shell', 'c..', 'perl', 'python', 'java')
  ),
  lisp = "Lisp", c = "C", shell = "Shell", c.. = "C++",
  perl = "Perl", python = "Python", java = "Java"
)

## Violin plot of the underproduction factor per language, drawn from
## trimPKDF.long.strict (the rows whose language proportion equals 1).
g <- ggplot(trimPKDF.long.strict, aes(x=lang, y=up.fac.mean, color=lang, fill=lang)) +
    #geom_boxplot() +
  geom_violin() +
  labs(x="Language", y="Underproduction Factor") +
  theme_bw() +
  theme(legend.position = 'none')   
g  # on-screen preview; only drawn when top-level auto-printing is active

## Write the figure to disk. print() is called explicitly because a bare `g`
## is not drawn when this file is run through source() (auto-printing is off
## by default there), which would leave an empty PDF behind.
pdf(paste0(figDir, "byLang.pdf"), width = 5, height = 3)
print(g)
dev.off()


## Scatter plot (heavily transparent points) of underproduction factor against
## package age in years, with a smoothed trend line on top.
## No colour aesthetic is mapped in this plot, so the former
## color="Installation Rank" label had nothing to attach to and was removed.
g <- ggplot(packageDF, aes(x=yearsOld, y=up.fac.mean)) +
  labs(x='Years Old', y="Underproduction Factor") +
  theme_bw() +
  theme(legend.position = 'bottom') +
  geom_point(alpha=0.05) +
  geom_smooth()
g  # on-screen preview; only drawn when top-level auto-printing is active

## Write the figure to disk. print() is called explicitly because a bare `g`
## is not drawn when this file is run through source() (auto-printing is off
## by default there), which would leave an empty PDF behind.
pdf(paste0(figDir, "byBirthday.pdf"), width = 5, height = 3)
print(g)
dev.off()

############## H2 is all about maintainer count

#q <- subset(packageDF, packageDF$maintainership_type == 'Loose')
#hist(log1p(q$maintainerCount))
#hist(log1p(q$uploaderCount))

## Violin plot of the underproduction factor, one violin per distinct value of
## maintainerCount (the count is converted to a factor for the x axis).
## The legend is hidden, so the colour/fill mapping only tints the violins.
g <- ggplot(packageDF, aes(y=up.fac.mean, x=as.factor(maintainerCount), group=maintainerCount, color=maintainerCount, fill=maintainerCount)) +
  #geom_boxplot() +
  geom_violin() +
  labs(x='Number of Maintainers', y="Underproduction Factor", color="Number of Maintainers") +
  theme_bw() +
  theme(legend.position = 'none')   
#  facet_wrap(~ maintainership_type)

g  # on-screen preview; only drawn when top-level auto-printing is active

## Write the figure to disk. print() is called explicitly because a bare `g`
## is not drawn when this file is run through source() (auto-printing is off
## by default there), which would leave an empty PDF behind.
pdf(paste0(figDir, "maintainerCount.pdf"), width = 5, height = 3)
print(g)
dev.off()


## Violin plot of the underproduction factor, one violin per distinct value of
## uploaderCount (the count is converted to a factor for the x axis).
## The legend is hidden, so the colour/fill mapping only tints the violins.
g <- ggplot(packageDF, aes(y=up.fac.mean, x=as.factor(uploaderCount), group=uploaderCount, color=uploaderCount, fill=uploaderCount)) +
  geom_violin() +
  labs(x='Number of Uploaders', y="Underproduction Factor", color="Number of Uploaders") +
  theme_bw() +
  theme(legend.position = 'none')  

g  # on-screen preview; only drawn when top-level auto-printing is active

## Write the figure to disk. print() is called explicitly because a bare `g`
## is not drawn when this file is run through source() (auto-printing is off
## by default there), which would leave an empty PDF behind.
pdf(paste0(figDir, "uploaderCount.pdf"), width = 5, height = 3)
print(g)
dev.off()

## Scatter plot of Eig.comment (labelled "Eigenvector Centrality") against
## birthday (labelled "Introduced to Debian"), coloured by is_underprod, with
## a smoothed trend line per colour group.
g <- ggplot(packageDF, aes(y=Eig.comment, x=birthday, color=is_underprod)) +
  geom_point(alpha=0.05) +
  geom_smooth() +
  labs(y="Eigenvector Centrality", x="Introduced to Debian", color="Underproduced?") +
  theme_bw()  +
  theme(legend.position = "bottom")

g  # on-screen preview; only drawn when top-level auto-printing is active

## Write the figure to disk (default pdf() page size, unlike the 5x3 figures
## elsewhere in this file). print() is called explicitly because a bare `g` is
## not drawn when this file is run through source() (auto-printing is off by
## default there), which would leave an empty PDF behind.
pdf(paste0(figDir, "byEigCentralityCommentAge.pdf"))
print(g)
dev.off()



## Model predictions from `preds`: one line per `group`, with a grey ribbon
## spanning conf.low..conf.high around each line. The x axis is limited to
## 0-25 and labelled "Package Age".
## NOTE(review): `preds` is created outside this file; its columns (x,
## predicted, conf.low, conf.high, group) look like marginal-effects output,
## and the two legend labels assume exactly two groups in that order - confirm.
g <- ggplot(preds, aes(x, predicted, color=group)) + 
  theme_bw() + 
  geom_ribbon(aes(ymin=conf.low, ymax=conf.high), fill='gray80', linewidth=0, alpha=0.5) +
  geom_line(aes(y=predicted)) + 
  scale_color_manual(labels=c("25 years old (Java)", "48 years old (C)"), values=c("blue", "red")) +
  xlim(0,25) +
  labs(color="Language Age", x="Package Age", y="Model Prediction of Underproduction")  +
  theme(legend.position = "bottom")

g  # on-screen preview; only drawn when top-level auto-printing is active

## Write the figure to disk as PNG. print() is called explicitly because a
## bare `g` is not drawn when this file is run through source() (auto-printing
## is off by default there), which would leave an empty image behind.
png(paste0(figDir, "margEffAgeInteract.png"))
print(g)
dev.off()


## HX is about inequality of participation

## Scatter plot of `underprod` against `hhi.manual` (labelled "Inequality of
## Participation"), with very transparent points and a smoothed trend line.
## This figure is only previewed; it is not written to a file here.
## NOTE(review): other plots in this file use `is_underprod`; confirm that
## `underprod` is the intended packageDF column.
g <- ggplot(packageDF, aes(x=hhi.manual, y=underprod)) +
  geom_point(alpha=0.05) +
  geom_smooth() +
  labs(x='Inequality of Participation', y="Underproduced?") +
  theme_bw() +
  theme(legend.position = 'bottom')
g
